In [3]:
import pandas as pd
# Load the score-comparison CSV into a DataFrame; the path is relative to the
# notebook's working directory.
movies = pd.read_csv("fandango_score_comparison.csv")
def preview(df, n=5):
    """Print the dimensions of ``df`` and return its first ``n`` rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise. It must be two-dimensional, because both
        ``shape[0]`` and ``shape[1]`` are read.
    n : int, optional
        Number of leading rows to return. Defaults to 5, which is what a
        bare ``df.head()`` returns.

    Returns
    -------
    pandas.DataFrame
        The first ``n`` rows of ``df``.
    """
    n_rows, n_cols = df.shape[0], df.shape[1]
    print("Dimensions: {0} rows x {1} columns".format(n_rows, n_cols))
    return df.head(n)
# Print the frame's dimensions; the returned head() is the cell's displayed output.
preview(movies)
Out[3]:
In [7]:
import matplotlib.pyplot as plt
%matplotlib inline
# Histogram of the Metacritic scores, using matplotlib's default binning.
plt.hist(movies["Metacritic_norm_round"])
plt.title("Distribution of Metacritic_norm_round")
plt.xlabel("Metacritic_norm_round")
plt.ylabel("Number of movies")
plt.show()

# Histogram of the Fandango star ratings, grouped into 5 bins.
plt.hist(movies["Fandango_Stars"], bins=5)
plt.title("Distribution of Fandango_Stars")
plt.xlabel("Fandango_Stars")
plt.ylabel("Number of movies")
plt.show()
Metacritic scores range from 0.5 to 4.5, while Fandango ratings range only from 3.0 to 5.0, which suggests that Fandango's ratings are biased toward the high end of the scale.
In [14]:
import numpy as np
def print_summary(label, scores):
    """Print the mean, median and standard deviation of ``scores``.

    Parameters
    ----------
    label : str
        Prefix for each printed line (e.g. ``"Metacritic"``).
    scores : pandas.Series
        Numeric column to summarise.

    Notes
    -----
    The standard deviation is taken with ``np.std``, whose default
    ``ddof=0`` yields the population standard deviation. ``Series.std()``
    would default to ``ddof=1`` and give a slightly larger value.
    """
    print("{0} mean: {1}".format(label, scores.mean()))
    print("{0} median: {1}".format(label, scores.median()))
    print("{0} standard deviation: {1}".format(label, np.std(scores)))


# Same three statistics for both rating sources, in the same order.
print_summary("Metacritic", movies["Metacritic_norm_round"])
print_summary("Fandango", movies["Fandango_Stars"])
In [16]:
# Scatter of Fandango stars against Metacritic scores, one point per row.
plt.scatter(movies["Metacritic_norm_round"], movies["Fandango_Stars"])
plt.title("Fandango_Stars vs. Metacritic_norm_round")
plt.xlabel("Metacritic_norm_round")
plt.ylabel("Fandango_Stars")
plt.show()
In [21]:
# Absolute gap between the two scores for every row, stored as a new column.
score_gap = movies["Metacritic_norm_round"] - movies["Fandango_Stars"]
movies["fm_diff"] = score_gap.abs()
movies["fm_diff"].head()
Out[21]:
In [27]:
# Rank rows from the largest to the smallest score gap and show the top of the ranking.
ranked_by_gap = movies.sort_values(by="fm_diff", ascending=False)
ranked_by_gap.head()
Out[27]:
In [33]:
from scipy import stats
# Pearson correlation between the two rating columns; `p` is the accompanying p-value.
fandango_stars = movies["Fandango_Stars"]
metacritic_scores = movies["Metacritic_norm_round"]
corr, p = stats.pearsonr(fandango_stars, metacritic_scores)
print("Correlation: {0}".format(corr))
In [38]:
# Least-squares fit of Fandango stars (response) on Metacritic scores (predictor).
fit = stats.linregress(movies["Metacritic_norm_round"], movies["Fandango_Stars"])
slope, intercept, r_value, p_value, std_err = fit

# Evaluate the fitted line at Metacritic scores of 3.0 and 4.0.
predicted_at_3 = slope * 3.0 + intercept
predicted_at_4 = slope * 4.0 + intercept
print("Metacritic: 3.0, predicted Fandango: {0}".format(predicted_at_3))
print("Metacritic: 4.0, predicted Fandango: {0}".format(predicted_at_4))
In [42]:
# Scatter of the raw scores with the fitted regression line drawn on top.
plt.scatter(movies["Metacritic_norm_round"], movies["Fandango_Stars"])

# Draw the line between Metacritic scores 3.0 and 4.0. The y values are
# computed from the fitted slope and intercept rather than hard-coded rounded
# numbers, so the line stays correct if the data or the fit changes.
x = [3.0, 4.0]
y = [slope * x_value + intercept for x_value in x]
plt.plot(x, y)
plt.xlabel("Metacritic_norm_round")
plt.ylabel("Fandango_Stars")
plt.show()
In [ ]: